{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "24ae5547",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "os.environ['CUDA_VISIBLE_DEVICES'] = '0'\n",
    "os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "3395870c",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ubuntu/.local/lib/python3.8/site-packages/apex/pyprof/__init__.py:5: FutureWarning: pyprof will be removed by the end of June, 2022\n",
      "  warnings.warn(\"pyprof will be removed by the end of June, 2022\", FutureWarning)\n"
     ]
    }
   ],
   "source": [
    "from transformers import T5Config, TFT5Model, T5Model, load_tf_weights_in_t5, T5Tokenizer\n",
    "from transformers import T5ForConditionalGeneration, TFT5ForConditionalGeneration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "06db4241",
   "metadata": {},
   "outputs": [],
   "source": [
    "!rm -rf temp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "a541b0ab",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "out = 'temp'\n",
    "os.makedirs(out, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "48753c21",
   "metadata": {},
   "outputs": [],
   "source": [
    "config = T5Config.from_pretrained('malay-huggingface/t5-small-bahasa-cased')\n",
    "config.save_pretrained(out)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "50a47b1d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "checkpoint\r\n",
      "events.out.tfevents.1658960344.huseincomel-desktop\r\n",
      "graph.pbtxt\r\n",
      "model.ckpt-1004200.data-00000-of-00002\r\n",
      "model.ckpt-1004200.data-00001-of-00002\r\n",
      "model.ckpt-1004200.index\r\n",
      "model.ckpt-1004200.meta\r\n",
      "model.ckpt-1021200.data-00000-of-00002\r\n",
      "model.ckpt-1021200.data-00001-of-00002\r\n",
      "model.ckpt-1021200.index\r\n",
      "model.ckpt-1021200.meta\r\n",
      "model.ckpt-1022200.data-00000-of-00002\r\n",
      "model.ckpt-1022200.data-00001-of-00002\r\n",
      "model.ckpt-1022200.index\r\n",
      "model.ckpt-1022200.meta\r\n",
      "model.ckpt-1023200.data-00000-of-00002\r\n",
      "model.ckpt-1023200.data-00001-of-00002\r\n",
      "model.ckpt-1023200.index\r\n",
      "model.ckpt-1023200.meta\r\n",
      "model.ckpt-1024200.data-00000-of-00002\r\n",
      "model.ckpt-1024200.data-00001-of-00002\r\n",
      "model.ckpt-1024200.index\r\n",
      "model.ckpt-1024200.meta\r\n",
      "model.ckpt-1025200.data-00000-of-00002\r\n",
      "model.ckpt-1025200.data-00001-of-00002\r\n",
      "model.ckpt-1025200.index\r\n",
      "model.ckpt-1025200.meta\r\n",
      "model.ckpt-1026200.data-00000-of-00002\r\n",
      "model.ckpt-1026200.data-00001-of-00002\r\n",
      "model.ckpt-1026200.index\r\n",
      "model.ckpt-1026200.meta\r\n",
      "model.ckpt-793200.data-00000-of-00002\r\n",
      "model.ckpt-793200.data-00001-of-00002\r\n",
      "model.ckpt-793200.index\r\n",
      "model.ckpt-793200.meta\r\n",
      "model.ckpt-811200.data-00000-of-00002\r\n",
      "model.ckpt-811200.data-00001-of-00002\r\n",
      "model.ckpt-811200.index\r\n",
      "model.ckpt-811200.meta\r\n",
      "model.ckpt-828200.data-00000-of-00002\r\n",
      "model.ckpt-828200.data-00001-of-00002\r\n",
      "model.ckpt-828200.index\r\n",
      "model.ckpt-828200.meta\r\n",
      "model.ckpt-846200.data-00000-of-00002\r\n",
      "model.ckpt-846200.data-00001-of-00002\r\n",
      "model.ckpt-846200.index\r\n",
      "model.ckpt-846200.meta\r\n",
      "model.ckpt-863200.data-00000-of-00002\r\n",
      "model.ckpt-863200.data-00001-of-00002\r\n",
      "model.ckpt-863200.index\r\n",
      "model.ckpt-863200.meta\r\n",
      "model.ckpt-881200.data-00000-of-00002\r\n",
      "model.ckpt-881200.data-00001-of-00002\r\n",
      "model.ckpt-881200.index\r\n",
      "model.ckpt-881200.meta\r\n",
      "model.ckpt-898200.data-00000-of-00002\r\n",
      "model.ckpt-898200.data-00001-of-00002\r\n",
      "model.ckpt-898200.index\r\n",
      "model.ckpt-898200.meta\r\n",
      "model.ckpt-916200.data-00000-of-00002\r\n",
      "model.ckpt-916200.data-00001-of-00002\r\n",
      "model.ckpt-916200.index\r\n",
      "model.ckpt-916200.meta\r\n",
      "model.ckpt-934200.data-00000-of-00002\r\n",
      "model.ckpt-934200.data-00001-of-00002\r\n",
      "model.ckpt-934200.index\r\n",
      "model.ckpt-934200.meta\r\n",
      "model.ckpt-951200.data-00000-of-00002\r\n",
      "model.ckpt-951200.data-00001-of-00002\r\n",
      "model.ckpt-951200.index\r\n",
      "model.ckpt-951200.meta\r\n",
      "model.ckpt-969200.data-00000-of-00002\r\n",
      "model.ckpt-969200.data-00001-of-00002\r\n",
      "model.ckpt-969200.index\r\n",
      "model.ckpt-969200.meta\r\n",
      "model.ckpt-986200.data-00000-of-00002\r\n",
      "model.ckpt-986200.data-00001-of-00002\r\n",
      "model.ckpt-986200.index\r\n",
      "model.ckpt-986200.meta\r\n",
      "operative_config.gin\r\n"
     ]
    }
   ],
   "source": [
    "!ls t5-small-noisy-en-ms"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "f54394ad",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "T5Model(\n",
       "  (shared): Embedding(32128, 512)\n",
       "  (encoder): T5Stack(\n",
       "    (embed_tokens): Embedding(32128, 512)\n",
       "    (block): ModuleList(\n",
       "      (0): T5Block(\n",
       "        (layer): ModuleList(\n",
       "          (0): T5LayerSelfAttention(\n",
       "            (SelfAttention): T5Attention(\n",
       "              (q): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (k): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (v): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (o): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (relative_attention_bias): Embedding(32, 8)\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "          (1): T5LayerFF(\n",
       "            (DenseReluDense): T5DenseActDense(\n",
       "              (wi): Linear(in_features=512, out_features=2048, bias=False)\n",
       "              (wo): Linear(in_features=2048, out_features=512, bias=False)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "              (act): ReLU()\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "        )\n",
       "      )\n",
       "      (1): T5Block(\n",
       "        (layer): ModuleList(\n",
       "          (0): T5LayerSelfAttention(\n",
       "            (SelfAttention): T5Attention(\n",
       "              (q): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (k): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (v): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (o): Linear(in_features=512, out_features=512, bias=False)\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "          (1): T5LayerFF(\n",
       "            (DenseReluDense): T5DenseActDense(\n",
       "              (wi): Linear(in_features=512, out_features=2048, bias=False)\n",
       "              (wo): Linear(in_features=2048, out_features=512, bias=False)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "              (act): ReLU()\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "        )\n",
       "      )\n",
       "      (2): T5Block(\n",
       "        (layer): ModuleList(\n",
       "          (0): T5LayerSelfAttention(\n",
       "            (SelfAttention): T5Attention(\n",
       "              (q): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (k): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (v): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (o): Linear(in_features=512, out_features=512, bias=False)\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "          (1): T5LayerFF(\n",
       "            (DenseReluDense): T5DenseActDense(\n",
       "              (wi): Linear(in_features=512, out_features=2048, bias=False)\n",
       "              (wo): Linear(in_features=2048, out_features=512, bias=False)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "              (act): ReLU()\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "        )\n",
       "      )\n",
       "      (3): T5Block(\n",
       "        (layer): ModuleList(\n",
       "          (0): T5LayerSelfAttention(\n",
       "            (SelfAttention): T5Attention(\n",
       "              (q): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (k): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (v): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (o): Linear(in_features=512, out_features=512, bias=False)\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "          (1): T5LayerFF(\n",
       "            (DenseReluDense): T5DenseActDense(\n",
       "              (wi): Linear(in_features=512, out_features=2048, bias=False)\n",
       "              (wo): Linear(in_features=2048, out_features=512, bias=False)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "              (act): ReLU()\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "        )\n",
       "      )\n",
       "      (4): T5Block(\n",
       "        (layer): ModuleList(\n",
       "          (0): T5LayerSelfAttention(\n",
       "            (SelfAttention): T5Attention(\n",
       "              (q): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (k): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (v): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (o): Linear(in_features=512, out_features=512, bias=False)\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "          (1): T5LayerFF(\n",
       "            (DenseReluDense): T5DenseActDense(\n",
       "              (wi): Linear(in_features=512, out_features=2048, bias=False)\n",
       "              (wo): Linear(in_features=2048, out_features=512, bias=False)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "              (act): ReLU()\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "        )\n",
       "      )\n",
       "      (5): T5Block(\n",
       "        (layer): ModuleList(\n",
       "          (0): T5LayerSelfAttention(\n",
       "            (SelfAttention): T5Attention(\n",
       "              (q): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (k): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (v): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (o): Linear(in_features=512, out_features=512, bias=False)\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "          (1): T5LayerFF(\n",
       "            (DenseReluDense): T5DenseActDense(\n",
       "              (wi): Linear(in_features=512, out_features=2048, bias=False)\n",
       "              (wo): Linear(in_features=2048, out_features=512, bias=False)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "              (act): ReLU()\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "        )\n",
       "      )\n",
       "    )\n",
       "    (final_layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "    (dropout): Dropout(p=0.1, inplace=False)\n",
       "  )\n",
       "  (decoder): T5Stack(\n",
       "    (embed_tokens): Embedding(32128, 512)\n",
       "    (block): ModuleList(\n",
       "      (0): T5Block(\n",
       "        (layer): ModuleList(\n",
       "          (0): T5LayerSelfAttention(\n",
       "            (SelfAttention): T5Attention(\n",
       "              (q): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (k): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (v): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (o): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (relative_attention_bias): Embedding(32, 8)\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "          (1): T5LayerCrossAttention(\n",
       "            (EncDecAttention): T5Attention(\n",
       "              (q): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (k): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (v): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (o): Linear(in_features=512, out_features=512, bias=False)\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "          (2): T5LayerFF(\n",
       "            (DenseReluDense): T5DenseActDense(\n",
       "              (wi): Linear(in_features=512, out_features=2048, bias=False)\n",
       "              (wo): Linear(in_features=2048, out_features=512, bias=False)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "              (act): ReLU()\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "        )\n",
       "      )\n",
       "      (1): T5Block(\n",
       "        (layer): ModuleList(\n",
       "          (0): T5LayerSelfAttention(\n",
       "            (SelfAttention): T5Attention(\n",
       "              (q): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (k): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (v): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (o): Linear(in_features=512, out_features=512, bias=False)\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "          (1): T5LayerCrossAttention(\n",
       "            (EncDecAttention): T5Attention(\n",
       "              (q): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (k): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (v): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (o): Linear(in_features=512, out_features=512, bias=False)\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "          (2): T5LayerFF(\n",
       "            (DenseReluDense): T5DenseActDense(\n",
       "              (wi): Linear(in_features=512, out_features=2048, bias=False)\n",
       "              (wo): Linear(in_features=2048, out_features=512, bias=False)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "              (act): ReLU()\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "        )\n",
       "      )\n",
       "      (2): T5Block(\n",
       "        (layer): ModuleList(\n",
       "          (0): T5LayerSelfAttention(\n",
       "            (SelfAttention): T5Attention(\n",
       "              (q): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (k): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (v): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (o): Linear(in_features=512, out_features=512, bias=False)\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "          (1): T5LayerCrossAttention(\n",
       "            (EncDecAttention): T5Attention(\n",
       "              (q): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (k): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (v): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (o): Linear(in_features=512, out_features=512, bias=False)\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "          (2): T5LayerFF(\n",
       "            (DenseReluDense): T5DenseActDense(\n",
       "              (wi): Linear(in_features=512, out_features=2048, bias=False)\n",
       "              (wo): Linear(in_features=2048, out_features=512, bias=False)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "              (act): ReLU()\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "        )\n",
       "      )\n",
       "      (3): T5Block(\n",
       "        (layer): ModuleList(\n",
       "          (0): T5LayerSelfAttention(\n",
       "            (SelfAttention): T5Attention(\n",
       "              (q): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (k): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (v): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (o): Linear(in_features=512, out_features=512, bias=False)\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "          (1): T5LayerCrossAttention(\n",
       "            (EncDecAttention): T5Attention(\n",
       "              (q): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (k): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (v): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (o): Linear(in_features=512, out_features=512, bias=False)\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "          (2): T5LayerFF(\n",
       "            (DenseReluDense): T5DenseActDense(\n",
       "              (wi): Linear(in_features=512, out_features=2048, bias=False)\n",
       "              (wo): Linear(in_features=2048, out_features=512, bias=False)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "              (act): ReLU()\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "        )\n",
       "      )\n",
       "      (4): T5Block(\n",
       "        (layer): ModuleList(\n",
       "          (0): T5LayerSelfAttention(\n",
       "            (SelfAttention): T5Attention(\n",
       "              (q): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (k): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (v): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (o): Linear(in_features=512, out_features=512, bias=False)\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "          (1): T5LayerCrossAttention(\n",
       "            (EncDecAttention): T5Attention(\n",
       "              (q): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (k): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (v): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (o): Linear(in_features=512, out_features=512, bias=False)\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "          (2): T5LayerFF(\n",
       "            (DenseReluDense): T5DenseActDense(\n",
       "              (wi): Linear(in_features=512, out_features=2048, bias=False)\n",
       "              (wo): Linear(in_features=2048, out_features=512, bias=False)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "              (act): ReLU()\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "        )\n",
       "      )\n",
       "      (5): T5Block(\n",
       "        (layer): ModuleList(\n",
       "          (0): T5LayerSelfAttention(\n",
       "            (SelfAttention): T5Attention(\n",
       "              (q): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (k): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (v): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (o): Linear(in_features=512, out_features=512, bias=False)\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "          (1): T5LayerCrossAttention(\n",
       "            (EncDecAttention): T5Attention(\n",
       "              (q): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (k): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (v): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (o): Linear(in_features=512, out_features=512, bias=False)\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "          (2): T5LayerFF(\n",
       "            (DenseReluDense): T5DenseActDense(\n",
       "              (wi): Linear(in_features=512, out_features=2048, bias=False)\n",
       "              (wo): Linear(in_features=2048, out_features=512, bias=False)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "              (act): ReLU()\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "        )\n",
       "      )\n",
       "    )\n",
       "    (final_layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "    (dropout): Dropout(p=0.1, inplace=False)\n",
       "  )\n",
       ")"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model = T5Model(config)\n",
    "load_tf_weights_in_t5(model, config, 't5-small-noisy-en-ms/model.ckpt-1026200')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "86b9d748",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "tokenizer = T5Tokenizer('sp10m.cased.ms-en.model', padding = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "7e68dcc6",
   "metadata": {},
   "outputs": [],
   "source": [
    "model.save_pretrained(out)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "737e7c80",
   "metadata": {},
   "outputs": [],
   "source": [
    "model_gen = T5ForConditionalGeneration.from_pretrained(out)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "005b412a",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2022-08-01 10:50:35.848085: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
      "2022-08-01 10:50:35.880687: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
      "2022-08-01 10:50:35.881454: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
      "2022-08-01 10:50:35.882465: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA\n",
      "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
      "2022-08-01 10:50:35.883389: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
      "2022-08-01 10:50:35.884197: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
      "2022-08-01 10:50:35.884944: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
      "2022-08-01 10:50:40.244892: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
      "2022-08-01 10:50:40.245584: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
      "2022-08-01 10:50:40.246239: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
      "2022-08-01 10:50:40.246870: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:39] Overriding allow_growth setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.\n",
      "2022-08-01 10:50:40.246888: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 20362 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3090 Ti, pci bus id: 0000:01:00.0, compute capability: 8.6\n",
      "2022-08-01 10:50:40.259861: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.\n",
      "2022-08-01 10:50:41.011519: I tensorflow/stream_executor/cuda/cuda_blas.cc:1760] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.\n",
      "Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFT5ForConditionalGeneration: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight']\n",
      "- This IS expected if you are initializing TFT5ForConditionalGeneration from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).\n",
      "- This IS NOT expected if you are initializing TFT5ForConditionalGeneration from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).\n",
      "All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.\n",
      "If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.\n"
     ]
    }
   ],
   "source": [
    "model_tf = TFT5ForConditionalGeneration.from_pretrained(out, from_pt = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "fc1a66fe",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('en-ms-right.test') as fopen:\n",
    "    right = fopen.read().split('\\n')\n",
    "    \n",
    "with open('en-ms-left.test') as fopen:\n",
    "    left = fopen.read().split('\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "3124e42d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "T5ForConditionalGeneration(\n",
       "  (shared): Embedding(32128, 512)\n",
       "  (encoder): T5Stack(\n",
       "    (embed_tokens): Embedding(32128, 512)\n",
       "    (block): ModuleList(\n",
       "      (0): T5Block(\n",
       "        (layer): ModuleList(\n",
       "          (0): T5LayerSelfAttention(\n",
       "            (SelfAttention): T5Attention(\n",
       "              (q): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (k): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (v): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (o): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (relative_attention_bias): Embedding(32, 8)\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "          (1): T5LayerFF(\n",
       "            (DenseReluDense): T5DenseActDense(\n",
       "              (wi): Linear(in_features=512, out_features=2048, bias=False)\n",
       "              (wo): Linear(in_features=2048, out_features=512, bias=False)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "              (act): ReLU()\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "        )\n",
       "      )\n",
       "      (1): T5Block(\n",
       "        (layer): ModuleList(\n",
       "          (0): T5LayerSelfAttention(\n",
       "            (SelfAttention): T5Attention(\n",
       "              (q): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (k): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (v): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (o): Linear(in_features=512, out_features=512, bias=False)\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "          (1): T5LayerFF(\n",
       "            (DenseReluDense): T5DenseActDense(\n",
       "              (wi): Linear(in_features=512, out_features=2048, bias=False)\n",
       "              (wo): Linear(in_features=2048, out_features=512, bias=False)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "              (act): ReLU()\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "        )\n",
       "      )\n",
       "      (2): T5Block(\n",
       "        (layer): ModuleList(\n",
       "          (0): T5LayerSelfAttention(\n",
       "            (SelfAttention): T5Attention(\n",
       "              (q): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (k): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (v): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (o): Linear(in_features=512, out_features=512, bias=False)\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "          (1): T5LayerFF(\n",
       "            (DenseReluDense): T5DenseActDense(\n",
       "              (wi): Linear(in_features=512, out_features=2048, bias=False)\n",
       "              (wo): Linear(in_features=2048, out_features=512, bias=False)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "              (act): ReLU()\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "        )\n",
       "      )\n",
       "      (3): T5Block(\n",
       "        (layer): ModuleList(\n",
       "          (0): T5LayerSelfAttention(\n",
       "            (SelfAttention): T5Attention(\n",
       "              (q): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (k): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (v): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (o): Linear(in_features=512, out_features=512, bias=False)\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "          (1): T5LayerFF(\n",
       "            (DenseReluDense): T5DenseActDense(\n",
       "              (wi): Linear(in_features=512, out_features=2048, bias=False)\n",
       "              (wo): Linear(in_features=2048, out_features=512, bias=False)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "              (act): ReLU()\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "        )\n",
       "      )\n",
       "      (4): T5Block(\n",
       "        (layer): ModuleList(\n",
       "          (0): T5LayerSelfAttention(\n",
       "            (SelfAttention): T5Attention(\n",
       "              (q): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (k): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (v): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (o): Linear(in_features=512, out_features=512, bias=False)\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "          (1): T5LayerFF(\n",
       "            (DenseReluDense): T5DenseActDense(\n",
       "              (wi): Linear(in_features=512, out_features=2048, bias=False)\n",
       "              (wo): Linear(in_features=2048, out_features=512, bias=False)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "              (act): ReLU()\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "        )\n",
       "      )\n",
       "      (5): T5Block(\n",
       "        (layer): ModuleList(\n",
       "          (0): T5LayerSelfAttention(\n",
       "            (SelfAttention): T5Attention(\n",
       "              (q): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (k): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (v): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (o): Linear(in_features=512, out_features=512, bias=False)\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "          (1): T5LayerFF(\n",
       "            (DenseReluDense): T5DenseActDense(\n",
       "              (wi): Linear(in_features=512, out_features=2048, bias=False)\n",
       "              (wo): Linear(in_features=2048, out_features=512, bias=False)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "              (act): ReLU()\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "        )\n",
       "      )\n",
       "    )\n",
       "    (final_layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "    (dropout): Dropout(p=0.1, inplace=False)\n",
       "  )\n",
       "  (decoder): T5Stack(\n",
       "    (embed_tokens): Embedding(32128, 512)\n",
       "    (block): ModuleList(\n",
       "      (0): T5Block(\n",
       "        (layer): ModuleList(\n",
       "          (0): T5LayerSelfAttention(\n",
       "            (SelfAttention): T5Attention(\n",
       "              (q): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (k): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (v): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (o): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (relative_attention_bias): Embedding(32, 8)\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "          (1): T5LayerCrossAttention(\n",
       "            (EncDecAttention): T5Attention(\n",
       "              (q): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (k): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (v): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (o): Linear(in_features=512, out_features=512, bias=False)\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "          (2): T5LayerFF(\n",
       "            (DenseReluDense): T5DenseActDense(\n",
       "              (wi): Linear(in_features=512, out_features=2048, bias=False)\n",
       "              (wo): Linear(in_features=2048, out_features=512, bias=False)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "              (act): ReLU()\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "        )\n",
       "      )\n",
       "      (1): T5Block(\n",
       "        (layer): ModuleList(\n",
       "          (0): T5LayerSelfAttention(\n",
       "            (SelfAttention): T5Attention(\n",
       "              (q): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (k): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (v): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (o): Linear(in_features=512, out_features=512, bias=False)\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "          (1): T5LayerCrossAttention(\n",
       "            (EncDecAttention): T5Attention(\n",
       "              (q): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (k): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (v): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (o): Linear(in_features=512, out_features=512, bias=False)\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "          (2): T5LayerFF(\n",
       "            (DenseReluDense): T5DenseActDense(\n",
       "              (wi): Linear(in_features=512, out_features=2048, bias=False)\n",
       "              (wo): Linear(in_features=2048, out_features=512, bias=False)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "              (act): ReLU()\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "        )\n",
       "      )\n",
       "      (2): T5Block(\n",
       "        (layer): ModuleList(\n",
       "          (0): T5LayerSelfAttention(\n",
       "            (SelfAttention): T5Attention(\n",
       "              (q): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (k): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (v): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (o): Linear(in_features=512, out_features=512, bias=False)\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "          (1): T5LayerCrossAttention(\n",
       "            (EncDecAttention): T5Attention(\n",
       "              (q): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (k): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (v): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (o): Linear(in_features=512, out_features=512, bias=False)\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "          (2): T5LayerFF(\n",
       "            (DenseReluDense): T5DenseActDense(\n",
       "              (wi): Linear(in_features=512, out_features=2048, bias=False)\n",
       "              (wo): Linear(in_features=2048, out_features=512, bias=False)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "              (act): ReLU()\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "        )\n",
       "      )\n",
       "      (3): T5Block(\n",
       "        (layer): ModuleList(\n",
       "          (0): T5LayerSelfAttention(\n",
       "            (SelfAttention): T5Attention(\n",
       "              (q): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (k): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (v): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (o): Linear(in_features=512, out_features=512, bias=False)\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "          (1): T5LayerCrossAttention(\n",
       "            (EncDecAttention): T5Attention(\n",
       "              (q): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (k): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (v): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (o): Linear(in_features=512, out_features=512, bias=False)\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "          (2): T5LayerFF(\n",
       "            (DenseReluDense): T5DenseActDense(\n",
       "              (wi): Linear(in_features=512, out_features=2048, bias=False)\n",
       "              (wo): Linear(in_features=2048, out_features=512, bias=False)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "              (act): ReLU()\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "        )\n",
       "      )\n",
       "      (4): T5Block(\n",
       "        (layer): ModuleList(\n",
       "          (0): T5LayerSelfAttention(\n",
       "            (SelfAttention): T5Attention(\n",
       "              (q): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (k): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (v): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (o): Linear(in_features=512, out_features=512, bias=False)\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "          (1): T5LayerCrossAttention(\n",
       "            (EncDecAttention): T5Attention(\n",
       "              (q): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (k): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (v): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (o): Linear(in_features=512, out_features=512, bias=False)\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "          (2): T5LayerFF(\n",
       "            (DenseReluDense): T5DenseActDense(\n",
       "              (wi): Linear(in_features=512, out_features=2048, bias=False)\n",
       "              (wo): Linear(in_features=2048, out_features=512, bias=False)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "              (act): ReLU()\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "        )\n",
       "      )\n",
       "      (5): T5Block(\n",
       "        (layer): ModuleList(\n",
       "          (0): T5LayerSelfAttention(\n",
       "            (SelfAttention): T5Attention(\n",
       "              (q): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (k): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (v): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (o): Linear(in_features=512, out_features=512, bias=False)\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "          (1): T5LayerCrossAttention(\n",
       "            (EncDecAttention): T5Attention(\n",
       "              (q): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (k): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (v): Linear(in_features=512, out_features=512, bias=False)\n",
       "              (o): Linear(in_features=512, out_features=512, bias=False)\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "          (2): T5LayerFF(\n",
       "            (DenseReluDense): T5DenseActDense(\n",
       "              (wi): Linear(in_features=512, out_features=2048, bias=False)\n",
       "              (wo): Linear(in_features=2048, out_features=512, bias=False)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "              (act): ReLU()\n",
       "            )\n",
       "            (layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "        )\n",
       "      )\n",
       "    )\n",
       "    (final_layer_norm): FusedRMSNorm(torch.Size([512]), eps=1e-06, elementwise_affine=True)\n",
       "    (dropout): Dropout(p=0.1, inplace=False)\n",
       "  )\n",
       "  (lm_head): Linear(in_features=512, out_features=32128, bias=False)\n",
       ")"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model_gen.cuda()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "35abd80e",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 69744/69744 [5:12:44<00:00,  3.72it/s]\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "batch_size = 1\n",
    "\n",
    "results = []\n",
    "for i in tqdm(range(0, len(left), batch_size)):\n",
    "    input_ids = [{'input_ids': tokenizer.encode(f'terjemah Inggeris ke Melayu: {s}', return_tensors = 'pt')[0]} for s in left[i:i + batch_size]]\n",
    "    padded = tokenizer.pad(input_ids, padding = 'longest')\n",
    "    outputs = model_gen.generate(padded['input_ids'].cuda(), attention_mask = padded['attention_mask'].cuda(), max_length = 1000)\n",
    "    for o in outputs:\n",
    "        results.append(tokenizer.decode(o, skip_special_tokens=True))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "9a28a3e4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# from tqdm import tqdm\n",
    "\n",
    "# batch_size = 2\n",
    "\n",
    "# results = []\n",
    "# for i in tqdm(range(0, len(left[:1]), batch_size)):\n",
    "#     input_ids = [{'input_ids': tokenizer.encode(f'terjemah Inggeris ke Melayu: {s}', return_tensors = 'pt')[0]} for s in left[i:i + batch_size]]\n",
    "#     padded = tokenizer.pad(input_ids, padding = 'longest')\n",
    "#     outputs = model_gen.generate(**padded, max_length = 1000)\n",
    "#     for o in outputs:\n",
    "#         results.append(tokenizer.decode(o, skip_special_tokens=True))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "52c3b550",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Anda tahu bagaimana kisahnya berjalan. Dua orang kulit putih heteroseksual, hampir pasti berjuang dalam hidup mereka. Mereka bertemu satu sama lain, berkumpul, berpisah, berkumpul lagi dan mencari kebahagiaan dan kesempurnaan yang sebenarnya dalam satu sama lain. Akhirnya, mereka boleh mula menjalani kehidupan yang normal! Rom-coms boleh menjadi pelarian yang menyeronokkan, tetapi ini adalah kisah yang diceritakan terlalu kerap, yang terlalu kekurangan dalam kepelbagaian, terlalu bergantung pada stereotaip gender dan terlalu prihatin dengan menjual jenama cinta yang mustahil untuk hidup: kajian tahun 2008 di Universiti Heriot Watt mendapati bahawa rom-coms mempunyai kesan negatif terhadap hubungan, menjadikan kita mengejar standard cinta yang tidak dapat dicapai. Dalam proses menulis drama baru saya Ross & Rachel, yang menghadapi mitos cinta moden dan dibuka di Edinburgh Festival Fringe pada bulan Ogos ini, saya harus banyak memikirkan percintaan dalam fiksyen dari Romeo dan Juliet hingga Pride dan Prejudice hingga ke Notting Hill. Mengapa kita terus menceritakan kisah yang sama?'"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "results[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "eb550d6f",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sacrebleu.metrics import BLEU, CHRF, TER\n",
    "\n",
    "bleu = BLEU()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "20530f31",
   "metadata": {},
   "outputs": [],
   "source": [
    "filtered_left, filtered_right = [], []\n",
    "for no, r in enumerate(results):\n",
    "    if len(r):\n",
    "        filtered_left.append(r)\n",
    "        filtered_right.append(right[no])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "e50b3fbd",
   "metadata": {},
   "outputs": [],
   "source": [
    "refs = [filtered_right]\n",
    "sys = filtered_left"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "2ba0ce1c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# for i in range(len(filtered_left)):\n",
    "#     print(filtered_left[i])\n",
    "#     print(filtered_right[i])\n",
    "#     print(right[i])\n",
    "#     print()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "e5d6f81d",
   "metadata": {},
   "outputs": [],
   "source": [
    "r = bleu.corpus_score(sys, refs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "cf03103f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'name': 'BLEU',\n",
       " 'score': 65.00070822235693,\n",
       " '_mean': -1.0,\n",
       " '_ci': -1.0,\n",
       " '_verbose': '84.8/70.5/60.6/52.9 (BP = 0.983 ratio = 0.983 hyp_len = 2585122 ref_len = 2630014)',\n",
       " 'bp': 0.9827843869831318,\n",
       " 'counts': [2191007, 1773499, 1481652, 1255934],\n",
       " 'totals': [2585122, 2515379, 2445659, 2376118],\n",
       " 'sys_len': 2585122,\n",
       " 'ref_len': 2630014,\n",
       " 'precisions': [84.75449127739425,\n",
       "  70.50623385183704,\n",
       "  60.58293490629724,\n",
       "  52.85655005349061],\n",
       " 'prec_str': '84.8/70.5/60.6/52.9',\n",
       " 'ratio': 0.9829308893412735}"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "r.__dict__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "88704b13",
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer.push_to_hub('t5-small-finetuned-noisy-en-ms', organization='mesolitica')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "97afc0d5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "0bc8c66492fa4251bbd50aacb41a664a",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Upload file pytorch_model.bin:   0%|          | 4.00k/231M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "To https://huggingface.co/mesolitica/t5-small-finetuned-noisy-en-ms\n",
      "   b9043ce..091d4f3  main -> main\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'https://huggingface.co/mesolitica/t5-small-finetuned-noisy-en-ms/commit/091d4f3b9018f0183c2811f3f8291c0fc2fb575b'"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model_gen.push_to_hub('t5-small-finetuned-noisy-en-ms', organization='mesolitica')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "db817c07",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "dc33cbee2d4f45a49bf4aee772c78b49",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Upload file tf_model.h5:   0%|          | 4.00k/231M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "To https://huggingface.co/mesolitica/t5-small-finetuned-noisy-en-ms\n",
      "   091d4f3..4877d15  main -> main\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'https://huggingface.co/mesolitica/t5-small-finetuned-noisy-en-ms/commit/4877d15c5540cc61f8f30403c762a9d74ffd600a'"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model_tf.push_to_hub('t5-small-finetuned-noisy-en-ms', organization='mesolitica')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "2819f991",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Already up to date.\r\n"
     ]
    }
   ],
   "source": [
    "!cd t5-small-finetuned-noisy-en-ms && git pull"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "df9b8117",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[main b16fde5] add tensorboard\n",
      " 1 file changed, 2 insertions(+), 2 deletions(-)\n",
      "Uploading LFS objects: 100% (1/1), 68 MB | 6.7 MB/s, done.                      \n",
      "Enumerating objects: 5, done.\n",
      "Counting objects: 100% (5/5), done.\n",
      "Delta compression using up to 16 threads\n",
      "Compressing objects: 100% (3/3), done.\n",
      "Writing objects: 100% (3/3), 375 bytes | 375.00 KiB/s, done.\n",
      "Total 3 (delta 1), reused 0 (delta 0)\n",
      "To https://huggingface.co/mesolitica/t5-small-finetuned-noisy-en-ms\n",
      "   4877d15..b16fde5  main -> main\n"
     ]
    }
   ],
   "source": [
    "!cp t5-small-noisy-en-ms/*.tfevents.* t5-small-finetuned-noisy-en-ms\n",
    "!cd t5-small-finetuned-noisy-en-ms && git add . && git commit -m 'add tensorboard' && git push"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
